%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import display, display_markdown
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import subprocess as sp
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
import arviz as az
import bambi
import copy
import warnings
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams['figure.dpi'] = 300
from scipy.stats import pearsonr, spearmanr
from itertools import chain
from collections import Counter
from subs2vec.utensils import log_timer
from subs2vec.vecs import Vectors
from subs2vec.neighbors import compute_nn
def display_md(md, **kwargs):
    """Display `md` through IPython's `display_markdown` with `raw=True`.

    Any extra keyword arguments are forwarded unchanged, and whatever
    `display_markdown` returns is returned to the caller.
    """
    result = display_markdown(md, raw=True, **kwargs)
    return result
def convert_notebook(title, output='html'):
    """Convert `{title}.ipynb` with `jupyter nbconvert` and report the outcome.

    Parameters
    ----------
    title : str
        Notebook name without the `.ipynb` extension.
    output : str
        Value passed to nbconvert's `--to` option; also used as the extension
        of the output file name.

    The result is reported through `display_md`; nothing is returned.
    """
    # Build the argument list directly instead of splitting a formatted string
    # on spaces, so a title that contains spaces stays a single argument.
    command = [
        'jupyter', 'nbconvert', f'{title}.ipynb',
        '--to', output,
        '--output', f'{title}.{output}',
    ]
    convert = sp.run(command)
    if convert.returncode == 0:
        display_md(f'Jupyter notebook `{title}` converted successfully.')
    else:
        display_md(f'Error: encountered problem converting Jupyter notebook `{title}`')
def download(fname):
    """Fetch `fname` by running `wget` and report the outcome.

    Parameters
    ----------
    fname : str
        Location handed to `wget`; it is passed as one single argument.

    The result is reported through `display_md`; nothing is returned.
    """
    # Pass the location as its own list element rather than splitting a
    # formatted string on spaces, so a location containing spaces is not
    # broken into several arguments.
    dl = sp.run(['wget', fname])
    if dl.returncode == 0:
        display_md(f'Download of `{fname}` succesful.')
    else:
        display_md(f'Download of `{fname}` failed.')
@log_timer
def filter_vecs(vecs, filter_words):
    """Return a deep copy of `vecs` restricted to the words in `filter_words`.

    The input object is left untouched. On the copy, `vectors` and `words` are
    subset with the same boolean mask and `n` is set to the number of words
    kept. A short before/after count is shown via `display_md`.
    """
    subset = copy.deepcopy(vecs)
    # one membership mask, applied to both the vectors and their words
    keep = np.isin(subset.words, filter_words)
    subset.vectors = subset.vectors[keep]
    subset.words = subset.words[keep]
    subset.n = len(subset.words)
    display_md(f'Filtered {vecs.n} vectors, {subset.n} remaining.')
    return subset
def norm(x):
    """Return `x` divided by `np.linalg.norm(x, 2)`."""
    magnitude = np.linalg.norm(x, ord=2)
    return x / magnitude
# plotting theme for all seaborn figures
sns.set(style='whitegrid')
# turn off pandas' chained-assignment warning
pd.set_option('mode.chained_assignment', None)

# wide-format ratings: one column per color
df = pd.read_csv('data/saysani_data.tsv', sep='\t')
display(df)
| participant | white | red | orange | yellow | green | blue | purple | brown | black | dimension | group | pp_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 7 | 7 | 5 | 1 | 1 | 1 | 3 | 7 | cold-hot | sighted | sighted_1 |
| 1 | 1 | 7 | 1 | 4 | 2 | 3 | 3 | 6 | 6 | 7 | ripe-unripe | sighted | sighted_1 |
| 2 | 1 | 1 | 5 | 6 | 7 | 4 | 2 | 3 | 7 | 6 | new-old | sighted | sighted_1 |
| 3 | 1 | 1 | 7 | 2 | 1 | 4 | 2 | 3 | 5 | 7 | submissive-aggressive | sighted | sighted_1 |
| 4 | 1 | 1 | 7 | 6 | 1 | 2 | 2 | 5 | 3 | 5 | selfless-jealous | sighted | sighted_1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 539 | 12 | 1 | 2 | 1 | 2 | 2 | 3 | 2 | 3 | 4 | soft-hard | blind | blind_12 |
| 540 | 12 | 4 | 3 | 3 | 4 | 2 | 2 | 3 | 2 | 5 | light-heavy | blind | blind_12 |
| 541 | 12 | 2 | 4 | 2 | 1 | 2 | 1 | 2 | 3 | 2 | relaxed-tense | blind | blind_12 |
| 542 | 12 | 4 | 2 | 1 | 1 | 1 | 3 | 2 | 3 | 5 | alive-dead | blind | blind_12 |
| 543 | 12 | 6 | 7 | 4 | 3 | 4 | 4 | 1 | 2 | 5 | fast-slow | blind | blind_12 |
544 rows × 13 columns
# the color terms that were rated; each one is a rating column in the wide data
colors = ['white', 'red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'black']

# reshape so that every row holds a single rating of one color
df_orig = pd.melt(
    df,
    id_vars=['group', 'dimension', 'pp_id'],
    value_vars=colors,
    var_name='color',
    value_name='rating',
)

# split every 'word1-word2' dimension label into its two words
dimension_labels = df_orig['dimension'].unique()
dimension_pairs = [label.split('-') for label in dimension_labels]
dimensions = list(chain.from_iterable(dimension_pairs))

# tag experiment and self vs. other so the replication data can be appended later
df_orig = df_orig.assign(experiment='original', self_vs_other='self')
display(df_orig)
| group | dimension | pp_id | color | rating | experiment | self_vs_other | |
|---|---|---|---|---|---|---|---|
| 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self |
| 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self |
| 2 | sighted | new-old | sighted_1 | white | 1 | original | self |
| 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self |
| 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 4891 | blind | soft-hard | blind_12 | black | 4 | original | self |
| 4892 | blind | light-heavy | blind_12 | black | 5 | original | self |
| 4893 | blind | relaxed-tense | blind_12 | black | 2 | original | self |
| 4894 | blind | alive-dead | blind_12 | black | 5 | original | self |
| 4895 | blind | fast-slow | blind_12 | black | 5 | original | self |
4896 rows × 7 columns
df_rep = pd.read_csv('data/replication1_data.csv')

# drop the test participant and the catch trials
is_test_participant = df_rep['pp_id'] == 3
is_catch_trial = df_rep['question_type'] == 'catch'
df_rep = df_rep[~is_test_participant & ~is_catch_trial]
df_rep = df_rep.drop(columns=['question_type', 'prompt_pre_1'])

# long format: the 'value' and 'others_choice' columns become separate rows
df_rep = pd.melt(
    df_rep,
    id_vars=['dimension', 'color', 'pp_id'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)

# bring ids and labels in line with the original data; the odd
# 'like-dis...like' dimension label is mapped to 'like-dislike'
df_rep = df_rep.assign(
    pp_id='sighted_' + df_rep['pp_id'].astype(str),
    self_vs_other=df_rep['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'}),
    group='sighted',
    experiment='replication_1',
    dimension=df_rep['dimension'].replace({'like-dis...like': 'like-dislike'}),
)
display(df_rep)
| dimension | color | pp_id | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|
| 0 | clean-dirty | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 1 | soft-hard | yellow | sighted_69819 | self | 2 | sighted | replication_1 |
| 2 | ripe-unripe | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| 3 | selfless-jealous | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 4 | high-low | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9567 | like-dislike | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9568 | new-old | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9569 | clean-dirty | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9570 | relaxed-tense | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9571 | active-passive | orange | sighted_69785 | other | 3 | sighted | replication_1 |
9572 rows × 7 columns
# replication 2 data, which also carries the reading questionnaire columns
df_read = pd.read_csv('data/replication2_data_with_reading.csv')
# two columns are not needed downstream
df_read = df_read.drop(columns=['Unnamed: 0', 'X'])
display(df_read)
| dimension | group | subj_id | color | value | question_type | others_choice | art | fiction | nonfiction | ... | Q9_17 | Q9_18 | Q9_19 | Q9_20 | Q9_21 | composite_read | upper_art | upper_fiction | upper_nonfiction | upper_read_motivation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | replication-sighted | 69212 | brown | 4 | semantic_diff | 4 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 1 | ripe-unripe | replication-sighted | 69212 | brown | 7 | semantic_diff | 6 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 2 | new-old | replication-sighted | 69212 | brown | 6 | semantic_diff | 6 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 3 | submissive-aggressive | replication-sighted | 69212 | brown | 2 | semantic_diff | 2 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 4 | selfless-jealous | replication-sighted | 69212 | brown | 5 | semantic_diff | 4 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14251 | light-heavy | replication-sighted | 68129 | red | 6 | semantic_diff | 5 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14252 | relaxed-tense | replication-sighted | 68129 | red | 6 | semantic_diff | 5 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14253 | alive-dead | replication-sighted | 68129 | red | 7 | semantic_diff | 6 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14254 | fast-slow | replication-sighted | 68129 | red | 1 | semantic_diff | 3 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14255 | high-low | replication-sighted | 68129 | red | 1 | semantic_diff | 2 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
14256 rows × 36 columns
def _reading_scale(frame, items, reverse_keyed=()):
    """Return the per-row mean of the `Q9_<k>` columns for each k in `items`.

    Items listed in `reverse_keyed` are multiplied by -1 before being added.
    Columns are added one at a time, left to right, starting from 0, so the
    arithmetic matches a hand-written `0 + a + b + ...` sum; a missing value
    in any item makes that row's result missing.
    """
    total = 0
    for number in items:
        column = frame[f'Q9_{number}']
        total = total + (column * -1 if number in reverse_keyed else column)
    return total / len(items)


# Q9_14 and Q9_17 enter with a negative sign in every scale that uses them
_REVERSE_KEYED = (14, 17)

# column-wise arithmetic instead of a row-by-row DataFrame.apply
df_read['reading_motivation'] = _reading_scale(df_read, range(1, 22), _REVERSE_KEYED)
df_read['reading_part_of_self'] = _reading_scale(df_read, (2, 3, 4, 5, 6, 9, 10, 11))
df_read['reading_efficacy'] = _reading_scale(df_read, (1, 14, 16, 17, 19, 20), _REVERSE_KEYED)
df_read['reading_recognition'] = _reading_scale(df_read, (12, 13, 15))
df_read['reading_other_realms'] = _reading_scale(df_read, (7, 8, 18, 21))
# participant id column gets the same name as in the earlier datasets
df_read = df_read.rename(columns={'subj_id': 'pp_id'})

# long format: the 'value' and 'others_choice' columns become separate rows,
# while the reading measures are carried along as identifier columns
reading_id_vars = ['art', 'fiction', 'nonfiction', 'reading_motivation',
                   'reading_part_of_self', 'reading_efficacy', 'reading_recognition',
                   'reading_other_realms']
df_read = pd.melt(
    df_read,
    id_vars=['dimension', 'color', 'pp_id'] + reading_id_vars,
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)

# bring ids and labels in line with the other datasets
df_read = df_read.assign(
    pp_id='sighted_' + df_read['pp_id'].astype(str),
    self_vs_other=df_read['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'}),
    group='sighted',
    experiment='replication_2',
)
display(df_read)
| dimension | color | pp_id | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 4 | sighted | replication_2 |
| 1 | ripe-unripe | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 7 | sighted | replication_2 |
| 2 | new-old | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 6 | sighted | replication_2 |
| 3 | submissive-aggressive | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 2 | sighted | replication_2 |
| 4 | selfless-jealous | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 5 | sighted | replication_2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28507 | light-heavy | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28508 | relaxed-tense | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28509 | alive-dead | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 6 | sighted | replication_2 |
| 28510 | fast-slow | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 3 | sighted | replication_2 |
| 28511 | high-low | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 2 | sighted | replication_2 |
28512 rows × 15 columns
# summary statistics for the numeric columns of the long-format replication 2 data
df_read.describe()
| art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | rating | |
|---|---|---|---|---|---|---|---|---|---|
| count | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 28512.000000 |
| mean | 7.616279 | 0.593023 | 0.755814 | -0.107973 | -0.280523 | 0.203488 | -0.616279 | 0.151163 | 3.693147 |
| std | 6.612596 | 0.854251 | 0.987569 | 0.646210 | 0.879559 | 0.646210 | 0.910077 | 0.759909 | 1.424941 |
| min | -5.000000 | 0.000000 | 0.000000 | -1.619048 | -2.000000 | -1.000000 | -2.000000 | -2.000000 | 1.000000 |
| 25% | 3.000000 | 0.000000 | 0.000000 | -0.571429 | -1.000000 | -0.333333 | -1.333333 | -0.500000 | 3.000000 |
| 50% | 6.000000 | 0.000000 | 0.000000 | -0.119048 | -0.375000 | 0.166667 | -0.666667 | 0.250000 | 4.000000 |
| 75% | 10.000000 | 1.000000 | 1.000000 | 0.285714 | 0.250000 | 0.666667 | 0.000000 | 0.500000 | 5.000000 |
| max | 26.000000 | 4.000000 | 4.000000 | 1.619048 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 7.000000 |
# pairwise correlations between the reading-related measures, rounded for display
reading_columns = ['art', 'fiction', 'nonfiction', 'reading_motivation', 'reading_part_of_self',
                   'reading_efficacy', 'reading_recognition', 'reading_other_realms']
corrs = df_read[reading_columns].corr().round(2)

# mask the upper triangle (diagonal included) so each pair is drawn once
mask = np.zeros_like(corrs)
upper_triangle = np.triu_indices_from(mask)
mask[upper_triangle] = True
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True, mask=mask)
g.set_yticklabels(g.get_yticklabels(), rotation=0);

# distribution of the 'art' column
g = sns.histplot(data=df_read, x='art')
# load the embeddings and keep only the color and dimension words
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs = filter_vecs(vecs, np.array(colors + dimensions))
vecs_dict = vecs.as_dict()

# separate vector sets for the colors and for the individual dimension words
color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))

# one vector per dimension pair: first word minus second word, rescaled by norm()
pole_differences = [norm(vecs_dict[pair[0]] - vecs_dict[pair[1]]) for pair in dimension_pairs]
dimension_pair_vecs = np.vstack(pole_differences)
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x13d7ac0d0> ran in 5.879 seconds
Filtered 200000 vectors, 43 remaining.
[INFO] <function filter_vecs at 0x14acad040> ran in 0.242 seconds [INFO] <function Vectors.as_dict at 0x13d7ac310> ran in 0.000 seconds
Filtered 43 vectors, 9 remaining.
[INFO] <function filter_vecs at 0x14acad040> ran in 0.000 seconds
Filtered 43 vectors, 34 remaining.
[INFO] <function filter_vecs at 0x14acad040> ran in 0.000 seconds
# rank the colors against every individual dimension word
dimension_neighbors = compute_nn(
    color_vecs,
    dimension_vecs.vectors,
    dimension_vecs.words,
    num_neighbors=9,
    whole_matrix=True,
)
# keep only the 'neighbor 0' .. 'neighbor 8' columns
negative_neighbor_columns = [f'neighbor -{rank}' for rank in range(1, 10)]
dimension_neighbors = dimension_neighbors.drop(columns=negative_neighbor_columns)
dimension_neighbors = dimension_neighbors.rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x13d7ac310> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x13d7ac5e0> ran in 0.002 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | like | white | black | brown | yellow | orange | purple | blue | green | red |
| 1 | old | white | black | brown | yellow | orange | blue | red | green | purple |
| 2 | new | white | green | yellow | black | red | blue | purple | brown | orange |
| 3 | light | yellow | red | green | orange | blue | purple | white | brown | black |
| 4 | hard | brown | white | black | red | orange | purple | yellow | green | blue |
| 5 | dead | black | brown | white | red | green | yellow | purple | blue | orange |
| 6 | cold | blue | white | green | black | red | brown | purple | yellow | orange |
| 7 | happy | white | yellow | orange | red | brown | blue | green | purple | black |
| 8 | hot | red | yellow | black | purple | white | orange | blue | green | brown |
| 9 | heavy | black | brown | green | red | purple | yellow | blue | white | orange |
| 10 | fast | red | purple | white | blue | yellow | green | black | brown | orange |
| 11 | soft | brown | yellow | white | green | orange | blue | purple | red | black |
| 12 | clean | white | blue | black | yellow | brown | green | red | orange | purple |
| 13 | slow | red | yellow | brown | black | green | purple | white | orange | blue |
| 14 | angry | red | orange | purple | blue | white | yellow | black | brown | green |
| 15 | alive | green | black | orange | red | blue | yellow | brown | purple | white |
| 16 | sad | brown | purple | blue | white | green | black | red | yellow | orange |
| 17 | fresh | white | green | blue | yellow | brown | red | black | purple | orange |
| 18 | calm | blue | green | brown | white | purple | yellow | black | red | orange |
| 19 | dirty | white | brown | yellow | blue | black | red | green | orange | purple |
| 20 | dull | brown | green | yellow | blue | orange | red | purple | black | white |
| 21 | relaxed | brown | white | green | yellow | red | blue | orange | black | purple |
| 22 | jealous | red | purple | orange | black | green | blue | yellow | brown | white |
| 23 | tense | white | black | red | yellow | blue | brown | purple | green | orange |
| 24 | exciting | green | orange | black | brown | purple | blue | red | white | yellow |
| 25 | active | orange | black | green | yellow | red | white | brown | blue | purple |
| 26 | ripe | purple | orange | brown | green | yellow | blue | red | white | black |
| 27 | aggressive | black | brown | orange | white | yellow | red | purple | green | blue |
| 28 | stale | brown | white | yellow | black | red | orange | green | blue | purple |
| 29 | dislike | brown | green | black | blue | orange | purple | white | yellow | red |
| 30 | passive | white | black | brown | yellow | red | green | purple | orange | blue |
| 31 | selfless | blue | purple | white | black | green | red | orange | yellow | brown |
| 32 | submissive | white | brown | yellow | purple | black | blue | orange | green | red |
| 33 | unripe | orange | purple | yellow | brown | green | blue | red | white | black |
# rank the colors against every dimension-pair difference vector
dimension_neighbors = compute_nn(
    color_vecs,
    dimension_pair_vecs,
    np.array(dimension_labels),
    num_neighbors=9,
    whole_matrix=True,
)
# keep only the 'neighbor 0' .. 'neighbor 8' columns
negative_neighbor_columns = [f'neighbor -{rank}' for rank in range(1, 10)]
dimension_neighbors = dimension_neighbors.drop(columns=negative_neighbor_columns)
dimension_neighbors = dimension_neighbors.rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x13d7ac310> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x13d7ac5e0> ran in 0.001 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | blue | white | green | brown | black | purple | red | yellow | orange |
| 1 | ripe-unripe | green | white | brown | black | blue | red | purple | yellow | orange |
| 2 | new-old | green | purple | yellow | red | blue | black | orange | white | brown |
| 3 | submissive-aggressive | white | blue | purple | yellow | brown | green | black | orange | red |
| 4 | selfless-jealous | white | blue | purple | brown | green | yellow | black | orange | red |
| 5 | active-passive | orange | green | blue | red | black | yellow | purple | brown | white |
| 6 | like-dislike | white | black | yellow | red | orange | brown | purple | blue | green |
| 7 | clean-dirty | white | blue | black | purple | green | red | orange | yellow | brown |
| 8 | fresh-stale | green | blue | purple | red | white | yellow | black | orange | brown |
| 9 | calm-angry | green | blue | brown | white | black | yellow | purple | orange | red |
| 10 | happy-sad | white | orange | yellow | red | blue | green | black | brown | purple |
| 11 | exciting-dull | black | white | purple | orange | red | green | blue | yellow | brown |
| 12 | soft-hard | yellow | brown | blue | green | orange | purple | white | red | black |
| 13 | light-heavy | yellow | orange | blue | red | green | white | purple | brown | black |
| 14 | relaxed-tense | brown | green | orange | yellow | white | blue | red | purple | black |
| 15 | alive-dead | orange | blue | green | purple | yellow | red | brown | black | white |
| 16 | fast-slow | white | blue | purple | orange | green | red | black | yellow | brown |
# stack the three experiments; columns missing from a dataset are filled with NaN
experiment_frames = [df_orig, df_rep, df_read]
df_joint = pd.concat(experiment_frames)
# the per-dataset row labels end up in an 'index' column
df_joint = df_joint.reset_index()
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
42980 rows × 16 columns
# unigram frequencies, replaced by their natural log
freqs = pd.read_csv('../datasets/dedup.en.words.unigrams.tsv', sep='\t')  # not included in git repo
freqs = freqs.assign(log_freq=np.log(freqs['unigram_freq'])).drop(columns='unigram_freq')
display(freqs.round(2))
| unigram | log_freq | |
|---|---|---|
| 0 | the | 17.10 |
| 1 | you | 17.06 |
| 2 | i | 17.04 |
| 3 | to | 16.78 |
| 4 | a | 16.59 |
| ... | ... | ... |
| 2397976 | tpar1 | 0.00 |
| 2397977 | giacoia | 0.00 |
| 2397978 | ourcinders | 0.00 |
| 2397979 | tourret | 0.00 |
| 2397980 | iroki | 0.00 |
2397981 rows × 2 columns
# the two words of each 'word1-word2' dimension label
label_words = [label.split('-') for label in df_joint['dimension']]
df_joint['word1'] = [words[0] for words in label_words]
df_joint['word2'] = [words[1] for words in label_words]

# look up log frequency for word1, then for word2; the second merge suffixes
# the overlapping lookup columns with _x (word1) and _y (word2)
for word_column in ('word1', 'word2'):
    df_joint = df_joint.merge(freqs, left_on=word_column, right_on='unigram', how='left')

# predictor: log frequency of word1 minus log frequency of word2
df_joint['frequency'] = df_joint['log_freq_x'] - df_joint['log_freq_y']
lookup_columns = ['unigram_x', 'unigram_y', 'log_freq_x', 'log_freq_y']
df_joint = df_joint.drop(columns=lookup_columns)
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 |
42980 rows × 19 columns
# per-word concreteness values (columns: word, concreteness)
concreteness = pd.read_csv('../datasets/en-brysbaert-2014.tsv', sep='\t') # not included in git repo
display(concreteness)
| word | concreteness | |
|---|---|---|
| 0 | a | 1.46 |
| 1 | aardvark | 4.68 |
| 2 | aback | 1.65 |
| 3 | abacus | 4.52 |
| 4 | abandon | 2.54 |
| ... | ... | ... |
| 37053 | zoologist | 4.30 |
| 37054 | zoology | 3.37 |
| 37055 | zoom | 3.10 |
| 37056 | zoophobia | 2.04 |
| 37057 | zucchini | 4.87 |
37058 rows × 2 columns
# look up concreteness for word1, then for word2; the second merge suffixes
# the overlapping lookup columns with _x (word1) and _y (word2)
for word_column in ('word1', 'word2'):
    df_joint = df_joint.merge(concreteness, left_on=word_column, right_on='word', how='left')

# predictor: concreteness of word1 minus concreteness of word2
df_joint['concreteness'] = df_joint['concreteness_x'] - df_joint['concreteness_y']
lookup_columns = ['word_x', 'word_y', 'concreteness_x', 'concreteness_y']
df_joint = df_joint.drop(columns=lookup_columns)
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | concreteness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 | -0.46 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 | -0.01 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 | 0.09 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 | -0.82 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 | -0.56 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 | 0.84 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 | 0.15 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 | -0.93 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 | 0.04 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 | 0.12 |
42980 rows × 20 columns
# word association data: one row per cue and participant, with responses in R1, R2 and R3
swow = pd.read_csv('../datasets/SWOW-EN.R100.csv') # not included in git repo
display(swow)
| Unnamed: 0 | id | participantID | age | gender | nativeLanguage | country | education | created_at | cue | R1 | R2 | R3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 29 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | although | nevertheless | yet | but |
| 1 | 2 | 30 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | deal | no | cards | shake |
| 2 | 3 | 31 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | music | notes | band | rhythm |
| 3 | 4 | 32 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | inform | tell | rat on | NaN |
| 4 | 5 | 33 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | way | path | via | method |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1228195 | 1228196 | 1530300 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | strange | mask | weird | stranger |
| 1228196 | 1228197 | 1530290 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | sunset | sea | sky | clause |
| 1228197 | 1228198 | 1530291 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | useless | pitty | worthless | worth |
| 1228198 | 1228199 | 1530284 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | volume | loud | music | key |
| 1228199 | 1228200 | 1530288 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | whenever | who | where | always |
1228200 rows × 13 columns
def add_swow(df, swow, colname):
    """Add a word-association difference score for every row of ``df``.

    For each row, the number of times ``color`` was given as a response to
    the cue ``word1`` is looked up in ``swow``, and likewise for ``word2``.
    ``colname`` is set to ``count(word1) - count(word2)``; cue/response pairs
    that never occur count as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``word1``, ``word2`` and ``color``.
    swow : pandas.DataFrame
        Long-format association data with a ``cue`` and a ``resp`` column,
        one row per response. Rows with a missing response are ignored.
    colname : str
        Name of the column that receives the difference score.

    Returns
    -------
    pandas.DataFrame
        A new frame (with the fresh ``RangeIndex`` produced by ``merge``)
        holding the columns of ``df`` followed by ``colname``.
    """
    # size() + reset_index(name='n') names the count column explicitly.
    # value_counts() followed by rename(columns={'resp': 'n'}) only works
    # while pandas names the counts after the counted column; from pandas 2.0
    # on they are called 'count', so 'n' would never be created.
    counts = swow.groupby(['cue', 'resp']).size().reset_index(name='n')

    helper_columns = []
    for pole in ('word1', 'word2'):
        # Private column names keep the two merges from colliding with each
        # other (or with columns already present in df).
        cue_col = f'_swow_cue_{pole}'
        resp_col = f'_swow_resp_{pole}'
        n_col = f'_swow_n_{pole}'
        pole_counts = counts.rename(
            columns={'cue': cue_col, 'resp': resp_col, 'n': n_col}
        )
        df = df.merge(
            pole_counts,
            left_on=[pole, 'color'],
            right_on=[cue_col, resp_col],
            how='left',
        )
        helper_columns.extend([cue_col, resp_col, n_col])

    # A pair without any responses has no match in the left merge -> NaN -> 0.
    df[colname] = (
        df['_swow_n_word1'].fillna(0) - df['_swow_n_word2'].fillna(0)
    )
    return df.drop(columns=helper_columns)
# Build the SWOW association predictors: keep only cues listed in
# `dimensions` (defined earlier in the notebook; presumably the dimension
# pole words -- confirm), split by respondent country, and add one
# difference-score column per subset via add_swow.
swow = swow[swow['cue'].isin(dimensions)]
swow_NZ = swow[(swow['country'] == 'New Zealand')] # select only NZ respondents
swow_US = swow[(swow['country'] == 'United States')] # select only US respondents
# count only R1 (maximal discounting)
df_joint = add_swow(df_joint, swow.rename(columns={'R1': 'resp'}), 'swow_R1')
df_joint = add_swow(df_joint, swow_NZ.rename(columns={'R1': 'resp'}), 'swow_R1_NZ') # NZ
df_joint = add_swow(df_joint, swow_US.rename(columns={'R1': 'resp'}), 'swow_R1_US') # US
# count R1, R2, and R3 with equal weight (minimal discounting)
# melt() stacks the three response columns into a single long 'resp' column
swow_all = swow.melt(
id_vars=['id', 'participantID', 'created_at', 'cue'],
value_vars=['R1', 'R2', 'R3'],
value_name='resp',
)
df_joint = add_swow(df_joint, swow_all, 'swow_all')
# NZ
swow_all_NZ = swow_NZ.melt(
id_vars=['id', 'participantID', 'created_at', 'cue'],
value_vars=['R1', 'R2', 'R3'],
value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_NZ, 'swow_all_NZ')
# US
swow_all_US = swow_US.melt(
id_vars=['id', 'participantID', 'created_at', 'cue'],
value_vars=['R1', 'R2', 'R3'],
value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_US, 'swow_all_US')
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | cold | hot | -0.216432 | -0.46 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | ripe | unripe | 3.485549 | -0.01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | new | old | 0.119068 | 0.09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | submissive | aggressive | -2.352148 | -0.82 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | relaxed | tense | -0.229652 | 0.15 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | alive | dead | -0.904786 | -0.93 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | fast | slow | 0.763262 | 0.04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | high | low | 1.237676 | 0.12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
42980 rows × 26 columns
# Tally, per cue, how often US respondents answered "green" (any of R1-R3);
# used as an example in the paper.
counts = swow_all_US.groupby(['cue', 'resp']).count().reset_index()
display(counts.loc[counts['resp'].eq('green')])
| cue | resp | id | participantID | created_at | variable | |
|---|---|---|---|---|---|---|
| 233 | alive | green | 1 | 1 | 1 | 1 |
| 508 | clean | green | 1 | 1 | 1 | 1 |
| 1108 | exciting | green | 1 | 1 | 1 | 1 |
| 1289 | fresh | green | 1 | 1 | 1 | 1 |
| 1456 | hard | green | 1 | 1 | 1 | 1 |
| 1706 | jealous | green | 20 | 20 | 20 | 20 |
| 1984 | new | green | 1 | 1 | 1 | 1 |
| 3010 | unripe | green | 18 | 18 | 18 | 18 |
# Inspect the rows with the most extreme association difference scores.
display(df_joint.sort_values(by='swow_all'))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25920 | 11452 | sighted | selfless-jealous | sighted_68676 | green | 2 | replication_2 | self | 4.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 5138 | 242 | sighted | selfless-jealous | sighted_68736 | green | 2 | replication_1 | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 37530 | 23062 | sighted | selfless-jealous | sighted_67653 | green | 6 | replication_2 | other | 10.0 | 4.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 25416 | 10948 | sighted | selfless-jealous | sighted_69192 | green | 7 | replication_2 | self | 9.0 | 1.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 16956 | 2488 | sighted | selfless-jealous | sighted_68719 | green | 5 | replication_2 | self | 3.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 466 | 466 | blind | clean-dirty | blind_8 | white | 2 | original | self | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 14129 | 9233 | sighted | clean-dirty | sighted_68738 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 33057 | 18589 | sighted | light-heavy | sighted_68150 | white | 2 | replication_2 | other | 9.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
| 12790 | 7894 | sighted | clean-dirty | sighted_68946 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 21537 | 7069 | sighted | light-heavy | sighted_67884 | white | 2 | replication_2 | self | 5.0 | 1.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
42980 rows × 26 columns
(It looks like there are very few responses from NZ, but somewhat more from the US and elsewhere.)
def get_cosine(x, vecs_dict):
    """Project a color onto the ``word1`` -> ``word2`` axis of one row.

    The axis is the difference vector ``word2 - word1`` scaled to unit
    length; the result is its dot product with the vector of ``color``
    (a cosine when the color vector has unit length -- the notebook loads
    vectors with ``normalize=True``, presumably unit-normalized; confirm).

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``word1``, ``word2`` and ``color``.
    vecs_dict : dict
        Maps words to 1-d numpy vectors of equal length.

    Returns
    -------
    float
        The projection; ``0.0`` if ``color`` has no vector; ``nan`` if the
        axis is undefined (both pole words missing, or identical vectors).

    NOTE(review): the later redefinition of ``get_cosine`` in this notebook
    uses the opposite direction (``word1 - word2``) -- confirm which sign is
    intended before comparing predictors built with the two versions.
    """
    pole1 = vecs_dict.get(x['word1'])
    pole2 = vecs_dict.get(x['word2'])
    if pole1 is None and pole2 is None:
        # No axis can be built. The former scalar fallback (0 - 0) made
        # np.linalg.norm raise on a 0-d input instead.
        return np.nan

    # Missing words fall back to a zero *vector* of matching size. A scalar 0
    # for a missing color turned np.dot into an elementwise product and
    # returned a whole array rather than a number.
    zero = np.zeros_like(pole1 if pole1 is not None else pole2)
    axis = (zero if pole2 is None else pole2) - (zero if pole1 is None else pole1)

    length = np.linalg.norm(axis, 2)
    if length == 0:
        return np.nan
    return np.dot(axis / length, vecs_dict.get(x['color'], zero))
# Score every row of df_joint with get_cosine using the cc.en.300 vectors
# (n=2e5 presumably caps the number of vectors read; d=300 is their size).
vecs = Vectors('../embeddings/cc.en.300.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_cc'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/cc.en.300.vec [INFO] <function Vectors.__init__ at 0x13d7ac0d0> ran in 5.757 seconds [INFO] <function Vectors.as_dict at 0x13d7ac310> ran in 0.054 seconds
# Same projection score, computed from the subs.en.1e6 vectors.
vecs = Vectors('../embeddings/subs.en.1e6.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_subs'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/subs.en.1e6.vec [INFO] <function Vectors.__init__ at 0x13d7ac0d0> ran in 6.154 seconds [INFO] <function Vectors.as_dict at 0x13d7ac310> ran in 0.041 seconds
# One projection column per genre-specific embedding file, in this order:
# academic, fiction, magazines, spoken, news.
for genre in ('acad', 'fic', 'mag', 'spok', 'news'):
    vecs = Vectors(f'../embeddings/{genre}.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
    vecs_dict = vecs.as_dict()
    df_joint[f'cosine_{genre}'] = df_joint.apply(
        lambda x: get_cosine(x, vecs_dict),
        axis=1,
    )
display(df_joint.round(2))
[INFO] loading vectors ../embeddings/acad.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.646 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.151 seconds [INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.292 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.131 seconds [INFO] loading vectors ../embeddings/mag.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.646 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.142 seconds [INFO] loading vectors ../embeddings/spok.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.418 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.136 seconds [INFO] loading vectors ../embeddings/news.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.070 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.155 seconds
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | swow_all | swow_all_NZ | swow_all_US | cosine_cc | cosine_subs | cosine_acad | cosine_fic | cosine_mag | cosine_spok | cosine_news | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | 0.02 | 0.03 | -0.03 | -0.04 | 0.07 | 0.00 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.02 | 0.09 | 0.11 | 0.05 | 0.08 | -0.17 | 0.09 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.12 | 0.04 | 0.16 | 0.07 | 0.08 | 0.06 | 0.05 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | -0.08 | -0.08 | -0.05 | -0.07 | -0.06 | -0.06 | -0.07 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | -0.01 | -0.00 | -0.04 | 0.03 | 0.04 | 0.11 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.16 | -0.04 | -0.07 | -0.10 | -0.14 | -0.13 | -0.05 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.11 | -0.04 | 0.08 | -0.03 | -0.01 | 0.01 | 0.01 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | -1.0 | 0.0 | 0.0 | 0.15 | 0.06 | 0.04 | 0.07 | -0.03 | 0.07 | 0.06 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.00 | -0.02 | -0.06 | -0.01 | -0.06 | 0.01 | -0.02 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.01 | 0.04 | -0.03 | -0.01 | 0.03 | 0.06 | 0.04 |
42980 rows × 33 columns
COCA embeddings, but from COCA corpora without sentences with 1st order cooccurrences (sentences with a color word and a dimension word).
def get_cosine(x, vecs_dict):
    """Project a color onto the ``word2`` -> ``word1`` axis of one row.

    The axis is the difference vector ``word1 - word2`` scaled to unit
    length; the result is its dot product with the vector of ``color``.
    Words without a vector are treated as zero vectors.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``word1``, ``word2`` and ``color``.
    vecs_dict : dict
        Maps words to 1-d numpy vectors of equal length.

    Returns
    -------
    float
        The projection; ``0.0`` if ``color`` has no vector; ``nan`` if the
        axis has zero length (both pole words missing, or identical vectors).

    NOTE(review): the earlier definition of ``get_cosine`` in this notebook
    subtracts in the opposite order (``word2 - word1``), so scores from the
    two versions have opposite signs -- confirm which direction is intended.
    """
    pole1 = vecs_dict.get(x['word1'])
    pole2 = vecs_dict.get(x['word2'])
    if pole1 is None and pole2 is None:
        # Zero-length axis: same NaN the 0/0 division used to produce, but
        # without the RuntimeWarning.
        return np.nan

    # Size the zero fallback from an actual vector instead of assuming 300
    # dimensions, so embeddings of any size work.
    zero = np.zeros_like(pole1 if pole1 is not None else pole2)
    axis = (zero if pole1 is None else pole1) - (zero if pole2 is None else pole2)

    length = np.linalg.norm(axis, 2)
    if length == 0:
        return np.nan
    return np.dot(axis / length, vecs_dict.get(x['color'], zero))
# fiction
# NOTE(review): unlike the noneighbors/nonames cells, no .fillna(0) is applied
# here, so any NaN returned by get_cosine (zero-length word1/word2 axis) is
# kept in this column -- confirm that is intended.
vecs = Vectors('../embeddings/fic.filtered.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_filtered'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/fic.filtered.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.957 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.138 seconds
COCA embeddings, but from training corpora from which the 100 nearest neighbors of each dimension word have been removed (in an attempt to disrupt the "scaffolding" that semantic associations with the dimension words are built on).
# fiction
vecs = Vectors('../embeddings/fic.noneighbors.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
# fillna(0): get_cosine yields NaN when the word1/word2 axis has zero length
# (e.g. both pole words absent from these vectors); such rows are scored 0.
df_joint['cosine_fic_noneighbors'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic.noneighbors.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.297 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.130 seconds
COCA embeddings, but from training corpora from which the labels generated by at least two participants for color-semantic associations (e.g. the label snow for the combination white and cold) have been removed. (These nameability data are explored in more detail in a section at the end of this notebook.)
# fiction
vecs = Vectors('../embeddings/fic.nonames.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
# fillna(0): get_cosine yields NaN when the word1/word2 axis has zero length
# (e.g. both pole words absent from these vectors); such rows are scored 0.
df_joint['cosine_fic_nonames'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic.nonames.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 14.704 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.139 seconds
# Absolute pairwise correlations between ratings and predictors, restricted to
# the original experiment.
df_orig = df_joint[df_joint['experiment'] == 'original']
# Correlate df_orig, not df_joint: using the full table here left df_orig
# unused and merely reproduced the pooled heatmap.
corrs = np.abs(df_orig[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
# Absolute pairwise correlations between ratings and predictors, restricted to
# the second replication.
df_rep = df_joint[df_joint['experiment'] == 'replication_2']
# Correlate df_rep, not df_joint: using the full table here left df_rep unused
# and merely reproduced the pooled heatmap.
corrs = np.abs(df_rep[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
# Absolute pairwise correlations between ratings and predictors, pooled over
# all experiments.
corrs = (
    df_joint[[
        'rating',
        'cosine_cc',
        'cosine_subs',
        'cosine_fic',
        'cosine_fic_filtered',
        'cosine_fic_noneighbors',
        'cosine_fic_nonames',
        'swow_all',
        'swow_all_NZ',
        'swow_all_US',
        'frequency',
        'concreteness',
    ]]
    .corr()
    .abs()
    .round(2)
)
g = sns.heatmap(corrs, annot=True, vmin=0, vmax=1)
def standardize(Series):
    """Return ``Series`` as z-scores.

    Each value has the mean of ``Series`` subtracted and is then divided by
    the value of ``Series.std()``.
    """
    centered = Series - Series.mean()
    spread = Series.std()
    return centered / spread
# z-score every continuous variable; each result is stored as '<column>_z'.
# The order of this list fixes the order of the new columns.
for column in [
    'art',
    'fiction',
    'nonfiction',
    'reading_motivation',
    'reading_part_of_self',
    'reading_efficacy',
    'reading_recognition',
    'reading_other_realms',
    'rating',
    'frequency',
    'concreteness',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'swow_R1',
    'swow_R1_NZ',
    'swow_R1_US',
    'cosine_cc',
    'cosine_subs',
    'cosine_acad',
    'cosine_fic',
    'cosine_mag',
    'cosine_news',
    'cosine_spok',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
]:
    df_joint[f'{column}_z'] = standardize(df_joint[column])

# Indicator columns for the categorical factors, built once per factor.
# dtype=int pins them to 0/1: newer pandas returns booleans by default, which
# would be written to the TSV as True/False.
group_dummies = pd.get_dummies(df_joint['group'], dtype=int)
df_joint['blind'] = group_dummies['blind']
df_joint['sighted'] = group_dummies['sighted']
df_joint['group_eff'] = (df_joint['sighted'] - .5) * 2  # effect coding: -1 / +1
df_joint['group_z'] = standardize(df_joint['sighted'])

experiment_dummies = pd.get_dummies(df_joint['experiment'], dtype=int)
df_joint['original'] = experiment_dummies['original']
df_joint['replication_1'] = experiment_dummies['replication_1']
df_joint['replication_2'] = experiment_dummies['replication_2']

self_other_dummies = pd.get_dummies(df_joint['self_vs_other'], dtype=int)
df_joint['other'] = self_other_dummies['other']
df_joint['self'] = self_other_dummies['self']
df_joint['self_vs_other_eff'] = (df_joint['other'] - .5) * 2  # effect coding: -1 / +1
df_joint['self_vs_other_z'] = standardize(df_joint['other'])

# Write into data/: the notebook reads this table back from
# 'data/data_plus_predictors.tsv', so writing it to the working directory
# left that read pointing at a missing (or stale) file.
df_joint.to_csv('data/data_plus_predictors.tsv', sep='\t', index=False)
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | sighted | group_eff | group_z | original | replication_1 | replication_2 | other | self | self_vs_other_eff | self_vs_other_z | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
42980 rows × 74 columns
# Reload the ratings-plus-predictors table from disk (tab-separated).
df_joint = pd.read_csv('data/data_plus_predictors.tsv', sep='\t')
def get_cosine_1word(x, vecs_dict):
    """Return the dot product of the ``dimension`` and ``color`` vectors.

    This equals the cosine similarity when both vectors have unit length
    (the notebook loads vectors with ``normalize=True`` -- presumably
    unit-normalized; confirm).

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``dimension`` (a single word) and ``color``.
    vecs_dict : dict
        Maps words to 1-d numpy vectors of equal length.

    Returns
    -------
    float
        The dot product, or ``0.0`` when either word has no vector.
    """
    word_vec = vecs_dict.get(x['dimension'])
    color_vec = vecs_dict.get(x['color'])
    # A missing word used to be replaced by a 300-d zero vector, which gives
    # 0 for 300-d embeddings and a shape error for any other size; returning
    # 0 directly keeps the result and drops the size assumption.
    if word_vec is None or color_vec is None:
        return 0.0
    return np.dot(word_vec, color_vec)
# fiction
# Load the fiction vectors as a word -> vector dict, plus the nameability
# table (one row per dimension-pole/color prompt, per the displayed head).
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_names = pd.read_csv('data/color_dimension_nameability.csv')
display(df_names.head())
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x13d7ac0d0> ran in 6.203 seconds [INFO] <function Vectors.as_dict at 0x13d7ac310> ran in 0.056 seconds
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat |
# check how many participants provided labels for each color-adjective pair
# (smallest and largest number of responses over all prompts), then list the
# prompts from lowest to highest modal agreement
print(df_names['number_responses'].min())
print(df_names['number_responses'].max())
display(df_names.sort_values('modal_agreement'))
7 13
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99 | liked_blue | liked | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... |
| 40 | relaxed_blue | relaxed | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | smurfette,meditation,bird,water,tranquility,st... | 0.076923 | smurfette,meditation,bird,water,tranquility,st... |
| 30 | submissive_blue | submissive | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | macaw,nun,bird,swallow,butterfly,flower,door,b... | 0.076923 | macaw,nun,bird,swallow,butterfly,flowers,door,... |
| 91 | old_blue | old | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | bluecheese,necklace,bird,dress,shoe,smurfs,rug... | 0.076923 | bluecheese,necklace,bird,dress,shoes,smurfs,ru... |
| 192 | clean_yellow | clean | yellow | 12 | 1.083333 | 1.000000 | 1.000000 | 0.000000 | 0.083333 | table,detergant,sun,glove,hat,flag,ford,mustan... | 0.083333 | table,detergant,sun,gloves,hat,flag,ford.musta... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 151 | clean_white | clean | white | 9 | 1.111111 | 0.700000 | 0.600000 | 0.222222 | 0.555556 | sheet | 0.333333 | sheets |
| 170 | ripe_yellow | ripe | yellow | 12 | 1.000000 | 0.500000 | 0.500000 | 0.318182 | 0.583333 | banana | 0.583333 | banana |
| 68 | cold_blue | cold | blue | 13 | 1.000000 | 0.461538 | 0.461538 | 0.358974 | 0.615385 | ice | 0.615385 | ice |
| 147 | cold_white | cold | white | 9 | 1.000000 | 0.333333 | 0.333333 | 0.583333 | 0.777778 | snow | 0.777778 | snow |
| 158 | stale_white | stale | white | 9 | 1.000000 | 0.222222 | 0.222222 | 0.777778 | 0.888889 | bread | 0.888889 | bread |
306 rows × 12 columns
# Flatten the comma-separated modal labels of every prompt into one list.
names = list(chain.from_iterable(
    entry.split(',') for entry in df_names['modal_names']
))
names_all = set(names) # all unique names
names_count = Counter(names)
# all names that occur 2+ times, most frequent first
names_2plus = [name for name, count in names_count.most_common() if count >= 2]
print(f'Number of labels named by at least 2 participants: {len(names_2plus)}')
with open('data/pair_labels_all.txt', 'w') as namesfile:
    # sorted(): a set has no stable order, so the file used to change from
    # run to run even when the labels did not.
    namesfile.write('\n'.join(sorted(names_all)))
with open('data/pair_labels_2plus.txt', 'w') as namesfile:
    # write(), not writelines(): the argument is one joined string, which
    # writelines() would iterate character by character.
    namesfile.write('\n'.join(names_2plus))
# let's ignore words like "me", "my", and "a" though
# NOTE(review): no such filtering happens in this cell -- presumably it is
# applied where these label files are consumed; confirm.
Number of labels named by at least 2 participants: 242
Since we only have nameability for colors and dimension axis poles (i.e. for yellow and dislike but not yellow and dislike-like), we correlate nameability measures with cosine similarity between color and dimension axis pole.
# How strongly do the two nameability measures agree with each other?
pearsonr(df_names['simpson_diversity'], df_names['modal_agreement'])
PearsonRResult(statistic=0.8947743710654124, pvalue=1.816739746708339e-108)
# Similarity between each color and dimension pole (fiction vectors), then its
# correlation with both nameability measures, followed by regression plots.
df_names['cosine_fic'] = df_names.apply(
    lambda x: get_cosine_1word(x, vecs_dict),
    axis=1,
)
display(df_names.head())
for measure in ('simpson_diversity', 'modal_agreement'):
    x = pearsonr(df_names['cosine_fic'], df_names[measure])
    print(f'pearsonr(cosine_fiction, {measure}): {x[0]:.3f}, p-value: {x[1]:.3f}')
for measure in ('simpson_diversity', 'modal_agreement'):
    g = sns.lmplot(x='cosine_fic', y=measure, data=df_names)
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | cosine_fic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy | 0.105870 |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.322344 |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood | 0.211326 |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark | 0.174112 |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat | 0.228968 |
pearsonr(cosine_fiction, simpson_diversity): 0.214, p-value: 0.000 pearsonr(cosine_fiction, modal_agreement): 0.202, p-value: 0.000
Since we do not have human ratings for the association between colors and dimension axis poles (only for association between colors and dimension axes), we need to collapse our nameability measures for the two poles of each dimension axis. One way to do this is to compute difference scores.
# Attach the nameability measures of both poles to every sighted rating and
# collapse them into word1-minus-word2 difference scores.
df_sighted = df_joint.loc[df_joint['group'] == 'sighted'].copy()
nameability = df_names[['dimension', 'color', 'simpson_diversity', 'modal_agreement']]
pole_stats = {
    pole: df_sighted[[pole, 'color']].merge(
        nameability,
        how='left',
        left_on=[pole, 'color'],
        right_on=['dimension', 'color'],
        # one nameability row per pole/color, so the row count is preserved
        validate='many_to_one',
    )
    for pole in ('word1', 'word2')
}
# merge() returns rows in the order of df_sighted but with a fresh 0..n-1
# index, whereas df_sighted keeps the row labels of df_joint. Assigning the
# merged Series directly aligns on those labels and attaches the values to
# the wrong rows (NaN beyond n), so assign by position instead.
df_sighted['diversity_word1'] = pole_stats['word1']['simpson_diversity'].to_numpy()
df_sighted['diversity_word2'] = pole_stats['word2']['simpson_diversity'].to_numpy()
df_sighted['agreement_word1'] = pole_stats['word1']['modal_agreement'].to_numpy()
df_sighted['agreement_word2'] = pole_stats['word2']['modal_agreement'].to_numpy()
df_sighted['diff_diversity'] = (df_sighted['diversity_word1'] - df_sighted['diversity_word2'])
df_sighted['diff_agreement'] = (df_sighted['agreement_word1'] - df_sighted['agreement_word2'])
# Drops every row with a missing value in ANY column, not only the new ones.
df_sighted = df_sighted.dropna()
display(df_sighted.head())
df_mean_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).mean().reset_index()
df_sd_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).std().reset_index()
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | other | self | self_vs_other_eff | self_vs_other_z | diversity_word1 | diversity_word2 | agreement_word1 | agreement_word2 | diff_diversity | diff_agreement | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14468 | 0 | sighted | cold-hot | sighted_69212 | brown | 4 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.012821 | 0.142857 | 0.285714 | -0.012821 | -0.142857 |
| 14469 | 1 | sighted | ripe-unripe | sighted_69212 | brown | 7 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.238095 | 0.035714 | 0.428571 | 0.285714 | 0.202381 | 0.142857 |
| 14470 | 2 | sighted | new-old | sighted_69212 | brown | 6 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14471 | 3 | sighted | submissive-aggressive | sighted_69212 | brown | 2 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14472 | 4 | sighted | selfless-jealous | sighted_69212 | brown | 5 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.044444 | 0.142857 | 0.285714 | -0.044444 | -0.142857 |
5 rows × 80 columns
pearsonr(rating, simpson_diversity_difference): 0.036, p-value: 0.666 pearsonr(rating, modal_agreement_difference): -0.012, p-value: 0.890 pearsonr(cosine_fiction, simpson_diversity_difference): 0.058, p-value: 0.492 pearsonr(cosine_fiction, modal_agreement_difference): 0.059, p-value: 0.482
# Relate the spread of the ratings (SD per color/dimension) to the two
# nameability difference scores.
df_mean_sighted['rating_sd'] = df_sd_sighted['rating']
g = sns.lmplot(x='rating_sd', y='diff_diversity', data=df_mean_sighted)
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
# The second test was a verbatim copy of the first (diff_diversity again, with
# the same label); it now covers the modal-agreement difference score.
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001 pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001
One other way to work around the issue of having only color to dimension axis pole nameability is to split and invert the human ratings of color-dimension axis associations to create two scores per rating: One for the right end of the axis (equal to the rating), and one for the left end of the axis (equal to eight minus the rating). For example: If yellow is assigned a 6 on the scale dislike-like, the rating for yellow/like is 6, but we also create a rating of 2 for yellow/dislike.
# Turn each axis rating into two pole ratings: word1 (left pole) receives
# 8 - rating, word2 (right pole) keeps the rating. Both halves are stacked and
# carry the nameability measures of their own pole.
df_inverse = pd.concat([
    df_sighted[['color', 'word1', 'rating', 'diversity_word1', 'agreement_word1']]
    .rename(columns={
        'word1': 'dimension',
        'diversity_word1': 'simpson_diversity',
        'agreement_word1': 'modal_agreement',
    })
    .assign(rating=lambda pole: 8 - pole['rating']),
    df_sighted[['color', 'word2', 'rating', 'diversity_word2', 'agreement_word2']]
    .rename(columns={
        'word2': 'dimension',
        'diversity_word2': 'simpson_diversity',
        'agreement_word2': 'modal_agreement',
    }),
])
display(df_inverse)
# Mean and SD per color/pole combination.
df_mean_inverse = df_inverse.groupby(['color', 'dimension']).mean().reset_index()
df_sd_inverse = df_inverse.groupby(['color', 'dimension']).std().reset_index()
for measure in ('simpson_diversity', 'modal_agreement'):
    x = pearsonr(df_mean_inverse['rating'], df_mean_inverse[measure])
    print(f'pearsonr(rating, {measure}): {x[0]:.3f}, p-value: {x[1]:.3f}')
| color | dimension | rating | simpson_diversity | modal_agreement | |
|---|---|---|---|---|---|
| 14468 | brown | cold | 4 | 0.000000 | 0.142857 |
| 14469 | brown | ripe | 1 | 0.238095 | 0.428571 |
| 14470 | brown | new | 2 | 0.000000 | 0.142857 |
| 14471 | brown | submissive | 6 | 0.000000 | 0.142857 |
| 14472 | brown | selfless | 3 | 0.000000 | 0.142857 |
| ... | ... | ... | ... | ... | ... |
| 41138 | yellow | hard | 2 | 0.000000 | 0.125000 |
| 41139 | yellow | heavy | 2 | 0.000000 | 0.125000 |
| 41140 | yellow | tense | 2 | 0.000000 | 0.125000 |
| 41141 | yellow | dead | 2 | 0.000000 | 0.125000 |
| 41142 | yellow | slow | 2 | 0.000000 | 0.125000 |
46272 rows × 5 columns
pearsonr(rating, simpson_diversity): 0.062, p-value: 0.293 pearsonr(rating, modal_agreement): 0.070, p-value: 0.237
# Add the per-group rating standard deviation next to the means, then plot and
# correlate it against both nameability measures.
df_mean_inverse['rating_sd'] = df_sd_inverse['rating']

# One regression plot per measure (modal agreement first, then Simpson
# diversity); `g` ends up bound to the last plot.
for measure in ('modal_agreement', 'simpson_diversity'):
    g = sns.lmplot(x='rating_sd', y=measure, data=df_mean_inverse)

rating_sd = df_mean_inverse['rating_sd']
x = pearsonr(rating_sd, df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(rating_sd, df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.228, p-value: 0.000 pearsonr(rating_sd, modal_agreement): 0.228, p-value: 0.000
In short: nameability (measured as Simpson diversity and name agreement for the modal name) is weakly correlated with cosine similarity between colors and dimension axis poles, but not with human ratings, regardless of whether we fit the nameability to the ratings (by computing difference scores for the nameability measures) or fit the ratings to the nameability (by computing inverse ratings for the left poles of the dimension axes).
# reload COCA-fiction vecs
vecs = Vectors('../embeddings/fic.filtered.en.vec', n=1e6, d=300, normalize=True)
vecs_dict = vecs.as_dict()

# Keep only the vectors for the color words and for the dimension pole words.
color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))

# One axis vector per dimension pair: first word's vector minus second word's.
pair_differences = []
for axis_pair in dimension_pairs:
    pair_differences.append(vecs_dict[axis_pair[0]] - vecs_dict[axis_pair[1]])
dimension_pair_vecs = np.vstack(pair_differences)
[INFO] loading vectors ../embeddings/fic.filtered.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 17.975 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.153 seconds
Filtered 232340 vectors, 9 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.616 seconds
Filtered 232340 vectors, 34 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.801 seconds
# Look up 100 neighbours for each dimension axis vector, relabel the 'target'
# column as 'dimension', show the table and store it as a TSV file.
dimension_neighbors = compute_nn(
    vecs,
    dimension_pair_vecs,
    np.array(dimension_labels),
    num_neighbors=100,
    whole_matrix=True,
).rename(columns={'target': 'dimension'})
display(dimension_neighbors)
dimension_neighbors.to_csv('100_neighbors_coca_fic.tsv', sep='\t', index=False)
[INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.145 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x7fb0731c9cb0> ran in 0.951 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | ... | neighbor -10 | neighbor -9 | neighbor -8 | neighbor -7 | neighbor -6 | neighbor -5 | neighbor -4 | neighbor -3 | neighbor -2 | neighbor -1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | cold | coldness | chill | icy | chilling | icy_cold | chilled | colder | chilly | ... | burnin | Bellissimo | spics | hotting | hotspots | hottest | hotter | hotspot | Hot | hot |
| 1 | ripe-unripe | ripe | Forties | Belmont | Intimidating | riper | Corrupt | Proxy | high_maintenance | Facilities | ... | kopf | tablets | uncap | unread | hoar | unsay | unrisen | unre | uns | unripe |
| 2 | new-old | new | fresh | new_batch | changes | rethink | redefining | sequencing | newly | newest | ... | old_duffer | oldtimer | ancient | gnarled | Old | older | old_hag | old_farts | old_fart | old |
| 3 | submissive-aggressive | submissively | submissive | submission | submissions | submissiveness | Submission | Daughter_Lord | submit | submits | ... | aggressor | aggressiveness | aggro | Aggressive | aggression | unaggressive | aggressions | aggres | aggressively | aggressive |
| 4 | selfless-jealous | selfless | selflessly | selflessness | selfless_act | ethos | wolfless_men | self_regard | self_denial | selfserving | ... | snoop_around | jealous_rage | upset | jealousies | Jealous | jealous_type | jealousy | jeal | jealously | jealous |
| 5 | active-passive | active | activ | active_duty | actively | health_clubs | Glacier_Bay | Inactive | fund_raising_event | fund_raising | ... | Passive | impassively | graceless | impassivity | submissiveness | passivity | impassive | Impassive | passively | passive |
| 6 | like-dislike | like | Like | Titty_Twister | Feels_like | raggedy_ass | Stelly | horror_movie | papier_mch | B_movie | ... | broad_mindedness | antagonism | an_instant_dislike | intense_dislike | Dislike | mislike | disliking | disliked | dislikes | dislike |
| 7 | clean-dirty | clean | cleaned | cleancut | sterilize | reasonably | sterilized | cleanser | cleanses | cleans | ... | dirty_slush | muddy | scurrilous | dirty_jokes | grimy | filthy | grubby | irty | Dirty | dirty |
| 8 | fresh-stale | fresh | afresh | freshly | Fresh | newly | new | clean | freshly_cut | refresh | ... | stale_odor | stale_tobacco | stalemated | stale_cigarettes | staleness | stale_smoke | stale_air | stale_beer | stale_cigarette_smoke | stale |
| 9 | calm-angry | calm | calming | serene | calms | cool | some_semblance | Breathe_deep | soothing | calmness | ... | humiliated | enraged | disgusted | furious | unhappy | angry_hornets | angrier | angry_mob | Angry | angry |
| 10 | happy-sad | happy | happ | blissfully_happy | perfectly_happy | oblige | supremely_happy | delighted | deliriously_happy | thrilled | ... | woebegone | forlorn | sadness | sorrowful | wifeless | sadder | lifeless | mournfulness | mournful | sad |
| 11 | exciting-dull | exciting | Exciting | excited | excitingly | most_exciting | interesting | unexciting | excite | terribly_exciting | ... | dulling | dull_throbbing | duller | dull_sheen | dull_eyed | dully | dulled | dull_dull | dulls | dull |
| 12 | soft-hard | soft | soft_murmur | soft_moan | purr | rustle | mewing | throaty | whispery | silky | ... | dif_ficult | hardest | Hard | Difficult | difficult | centrate | harder_than | proving_difficult | harder | hard |
| 13 | light-heavy | light | illumination | dawnlight | lights | sunlight | wan_light | halogen_lights | illumine | illumi | ... | heavily_laden | heavy_wool | eavy | heavies | hefty | heavy_load | laden | thick | Heavy | heavy |
| 14 | relaxed-tense | relaxed | relaxer | relaxing | relax | Relaxed | eased | Contented | muscles_relaxed | contented | ... | final_confrontation | desperate_straits | Intense | Tense | nervous | accusatory | fearful | tenser | tenses | tense |
| 15 | alive-dead | alive | pleasurably | 58_9 | stay_alive | aloof | under_control | hardly_contain | marvelously | could_hardly_contain | ... | empty_beer_cans | muddy_ditch | fallen | Dead | graves | laundryman | grave | Dropped | deader | dead |
| 16 | fast-slow | fast | quickly | Quickly | Ouickly | rapidly | faster_faster | Faster_faster | soon | easily | ... | mesmerizing | deliberate | cadenced | quiescence | unremitting_courtesy | slow_drawl | languor | languorous | gentle | slow |
17 rows × 201 columns
# Prepare the data for the rating figures: drop the 'high-low' dimension,
# average per dimension/color, and pick for every dimension the color(s) with
# the lowest and the highest mean rating.
df_viz = df_joint[df_joint['dimension'] != 'high-low']

# numeric_only=True: the frame also holds string columns, and averaging those
# raises a TypeError on recent pandas versions (older versions dropped them
# silently, which is the behaviour kept here).
df_means = (
    df_viz.groupby(['dimension', 'color', 'word1', 'word2'])
    .mean(numeric_only=True)
    .reset_index()
)

# Order dimensions by how much the mean rating varies across colors (most
# variable first). Selecting 'rating' before .std() avoids aggregating the
# string columns (color, word1, word2) that df_means still carries.
dim_order = (
    df_means.groupby('dimension')['rating']
    .std()
    .sort_values(ascending=False)
    .reset_index()['dimension']
)
df_means = df_means.set_index('dimension').loc[dim_order].reset_index()

# Rows whose mean rating equals the per-dimension minimum / maximum.
# String aggregation names are used instead of the builtins min/max, which
# pandas deprecates as transform arguments.
mins_idx = df_means.groupby(['dimension'])['rating'].transform('min') == df_means['rating']
mins = df_means[mins_idx]
maxs_idx = df_means.groupby(['dimension'])['rating'].transform('max') == df_means['rating']
maxs = df_means[maxs_idx]

# Pull the individual (unaveraged) ratings for those extreme colors. The
# minimum rows carry the word2 label and the maximum rows the word1 label.
df_mins = mins[['word2', 'dimension', 'color']].merge(
    df_viz[['word2', 'dimension', 'color', 'rating']],
    how='left', on=['dimension', 'color', 'word2'])
df_maxs = maxs[['word1', 'dimension', 'color']].merge(
    df_viz[['word1', 'dimension', 'color', 'rating']],
    how='left', on=['dimension', 'color', 'word1'])
display(df_mins)
display(df_maxs)
| word2 | dimension | color | rating | |
|---|---|---|---|---|
| 0 | hot | cold-hot | blue | 1 |
| 1 | hot | cold-hot | blue | 1 |
| 2 | hot | cold-hot | blue | 1 |
| 3 | hot | cold-hot | blue | 1 |
| 4 | hot | cold-hot | blue | 2 |
| ... | ... | ... | ... | ... |
| 4555 | passive | active-passive | red | 2 |
| 4556 | passive | active-passive | red | 5 |
| 4557 | passive | active-passive | red | 2 |
| 4558 | passive | active-passive | red | 3 |
| 4559 | passive | active-passive | red | 3 |
4560 rows × 4 columns
| word1 | dimension | color | rating | |
|---|---|---|---|---|
| 0 | cold | cold-hot | red | 7 |
| 1 | cold | cold-hot | red | 7 |
| 2 | cold | cold-hot | red | 7 |
| 3 | cold | cold-hot | red | 6 |
| 4 | cold | cold-hot | red | 7 |
| ... | ... | ... | ... | ... |
| 4547 | active | active-passive | brown | 6 |
| 4548 | active | active-passive | brown | 5 |
| 4549 | active | active-passive | brown | 4 |
| 4550 | active | active-passive | brown | 4 |
| 4551 | active | active-passive | brown | 3 |
4552 rows × 4 columns
# Point plot of the ratings of every color on every dimension, with the word1
# labels on the left y-axis and the word2 labels on a twinned right y-axis.
sns.set_style('darkgrid')

# Map each color name onto itself so seaborn draws every hue in its own color
# (presumably all names are valid matplotlib color names - confirm).
all_colors = {color: color for color in df_viz['color'].unique()}

fig, ax1 = plt.subplots(figsize=(3, 8))
# seaborn expects the confidence level in percent: ('ci', 95) is a 95% interval,
# whereas ('ci', .95) would draw a 0.95% interval, i.e. practically no error bar.
sns.pointplot(data=df_viz, y='word1', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax1, errorbar=('ci', 95))
ax2 = ax1.twinx()
sns.pointplot(data=df_viz, y='word2', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax2, errorbar=('ci', 95))
ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
# Trailing semicolon suppresses the return value in the notebook output.
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7]);
# Violin plots of the individual ratings for the highest-rated (df_maxs, left
# axis, word1 labels) and lowest-rated (df_mins, right axis, word2 labels)
# color of each dimension; the figure is written to figures/color_ratings.pdf.
sns.set_style('whitegrid')

# Each color name maps onto itself so it can serve as its own palette entry.
mins_colors = dict(zip(mins['color'], mins['color']))
maxs_colors = dict(zip(maxs['color'], maxs['color']))

# Options shared by both violin plots.
violin_options = dict(x='rating', hue='color', dodge=False, inner=None, cut=0)

fig, ax1 = plt.subplots(figsize=(3, 7))
sns.violinplot(data=df_maxs, y='word1', palette=maxs_colors, ax=ax1, **violin_options)
ax2 = ax1.twinx()
sns.violinplot(data=df_mins, y='word2', palette=mins_colors, ax=ax2, **violin_options)

# Same cosmetic treatment for both axes: translucent violins, no y-label,
# no legend.
for axis in (ax1, ax2):
    plt.setp(axis.collections, alpha=.8)
    axis.set(ylabel='')
    axis.get_legend().remove()

ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7])
plt.savefig('figures/color_ratings.pdf', bbox_inches='tight')
# Scatter of mean participant rating against the COCA-fiction embedding
# projection, one panel per participant group, one regression line per color.
sns.set_style('darkgrid')

# Per-group means for every dimension/color combination.
# numeric_only=True: the frame also holds string columns, and averaging those
# raises a TypeError on recent pandas versions.
# NOTE: this rebinds df_blind / df_sighted to aggregated frames.
group_keys = ['group', 'dimension', 'color', 'word1', 'word2']
df_blind = (
    df_viz[df_viz['group'] == 'blind']
    .groupby(group_keys).mean(numeric_only=True).reset_index()
)
df_sighted = (
    df_viz[df_viz['group'] == 'sighted']
    .groupby(group_keys).mean(numeric_only=True).reset_index()
)
df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')

# Each color name doubles as its own palette entry; unique() avoids walking
# the frame row by row.
means_colors = {color: color for color in df_scatter['color'].unique()}

# col_order pins the panel order. Without it the order follows the first
# appearance of each group in the (sorted) data, so the hard-coded titles set
# below could end up on the wrong panel.
g = sns.FacetGrid(df_scatter, hue='color', col='group', col_order=['blind', 'sighted'],
                  height=5, palette=means_colors, aspect=.5, sharex=True)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=10)
# ci=None is the documented way to disable the confidence band; ci=False is not
# None, so seaborn would still bootstrap and draw a degenerate band.
g.map(sns.regplot, 'cosine_fic_z', 'rating', scatter=False, ci=None)
g.set(xlabel='COCA-fiction\nembedding projection')
g.axes[0][0].set(ylabel='mean participant rating')
g.axes[0][0].set(title='blind')
g.axes[0][1].set(title='sighted')
g.set(ylim=[.75, 7.25], xlim=[-2.9, 2.9])
plt.savefig('figures/scatter_color.pdf', bbox_inches='tight')
# Export the notebook titled 'data_prep' via the convert_notebook helper
# (default output format: html).
# NOTE(review): the traceback recorded after this cell reports a NameError for
# display_markdown, yet the notebook's import cell does import it - presumably
# the stored output is stale; re-run to confirm. Also verify that 'data_prep'
# is the title of this notebook.
convert_notebook('data_prep')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-11-07966e3605ae> in <module> ----> 1 convert_notebook('data_prep') <ipython-input-1-ebaa396ce086> in convert_notebook(title, output) 33 convert = sp.run(f'jupyter nbconvert {title}.ipynb --to {output} --output {title}.{output}'.split(' ')) 34 if convert.returncode == 0: ---> 35 display_md(f'Jupyter notebook `{title}` converted successfully.') 36 else: 37 display_md(f'Error: encountered problem converting Jupyter notebook `{title}`') <ipython-input-1-ebaa396ce086> in display_md(md, **kwargs) 28 29 def display_md(md, **kwargs): ---> 30 return display_markdown(md, raw=True, **kwargs) 31 32 def convert_notebook(title, output='html'): NameError: name 'display_markdown' is not defined